# __author__ = 'heyin'
# __date__ = '2018/11/12 9:36'
# Implementation of the k-nearest-neighbors (KNN) algorithm
import numpy as np
import pandas as pd
from pyecharts import Scatter3D, Line  # pyecharts 0.x API (1.x moved charts to pyecharts.charts)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.datasets import load_iris, load_boston  # load_boston was removed in scikit-learn 1.2


def knn_1():
    # Prepare the data: generate five labeled clusters with numpy
    train = None
    np.random.seed(1)  # fix the seed so every run generates the same data
    for i in range(5):
        x = np.random.randint(200 * i - 100, 200 * (i + 1), (200, 3))  # integer features in a band that shifts with i
        y = np.random.randint(i, i + 1, (200, 1))  # randint over [i, i + 1) always yields i, i.e. the class label
        d = np.hstack((x, y))
        if i == 0:
            train = d
        else:
            train = np.vstack((train, d))
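    # Equivalent, more direct label construction for each block (illustrative
    # sketch, not used above):
    #   d = np.hstack((x, np.full((200, 1), i)))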

    # Build a few query points to classify later ("ceshidian" = test points)
    ceshidian = list()
    for j in range(10):
        ceshidian.append([1000, 111 * (j + 1), 111 * (j + 1)])

    range_color = [
        '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
        '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
    # Plot the raw points, plus the query points, on a 3D scatter chart
    scatter3D = Scatter3D("3D scatter example", width=1200, height=600)
    scatter3D.add("", train[:, 0:-1], is_visualmap=True, visual_range_color=range_color)
    scatter3D.add('', ceshidian, is_visualmap=True, visual_range_color=range_color)
    scatter3D.render(path='./echart_html/3dknn自设数据.html')

    # Split into training and test sets. Unseen data must also be standardized
    # before prediction -- but with which parameters? With the parameters
    # fitted on the training set.

    x = train[:, 0:-1].astype(np.float64)
    y = train[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # KNN needs standardization or normalization; min-max normalization is
    # heavily influenced by the extreme values, so standardization is used here
    sd = StandardScaler()
    x_train = sd.fit_transform(x_train)
    x_test = sd.transform(x_test)
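    # Sanity check (illustrative): both splits are transformed with the
    # statistics fitted on the training split alone; uncomment to inspect them.
    # print('training means / scales:', sd.mean_, sd.scale_)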

    knc = KNeighborsClassifier(n_neighbors=3)
    knc.fit(x_train, y_train)

    y_pred = knc.predict(x_test)
    # score here is accuracy
    print('training-set score:', knc.score(x_train, y_train))
    print('test-set score:', knc.score(x_test, y_test))
    # the report below adds precision -- among the samples predicted as a
    # class, the share that truly belong to it -- plus recall and f1
    ret = classification_report(y_test, y_pred)
    print(ret)
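    # Cross-check (illustrative sketch): the report's per-class numbers can be
    # reproduced directly, e.g. macro-averaged over the five classes:
    # from sklearn.metrics import precision_score, recall_score
    # print(precision_score(y_test, y_pred, average='macro'))
    # print(recall_score(y_test, y_pred, average='macro'))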

    # Hyperparameter tuning via cross-validation and grid search
    # over candidate values of n_neighbors
    kn = KNeighborsClassifier()  # no n_neighbors passed here; the grid supplies it
    params = {'n_neighbors': [3, 5, 7, 9, 11]}
    gscv = GridSearchCV(kn, params, cv=5)
    gscv.fit(x_train, y_train)
    pred = gscv.predict(x_test)
    print('best mean cross-validation score:', gscv.best_score_)
    print('per-candidate cross-validation results:', gscv.cv_results_)
    print('best estimator:', gscv.best_estimator_)
    print(gscv.best_params_)
    print('grid-search test-set score:', gscv.score(x_test, y_test))
    print('grid-search training-set score:', gscv.score(x_train, y_train))
    print('grid-search precision/recall report:', classification_report(y_test, pred))

    # Classify the query points built above.
    # They must be standardized with the training-set scaler; without that
    # step the predictions are wildly wrong.
    ceshidian = sd.transform(ceshidian)
    print(gscv.predict(ceshidian))


def knn_iris():
    # KNN prediction on the bundled iris dataset
    iris_set = load_iris()
    # take out the features and the targets
    x = iris_set.data
    y = iris_set.target
    target_names = iris_set.target_names
    print(x)
    print(y)
    print(target_names)
    print(iris_set.feature_names)

    # Take the rows of each of the three classes out of the features
    # (the exploration below is left commented out)
    # a = 0
    # b = 0
    # c = 0
    # for i in y:
    #     if i == 0:
    #         a +=1
    #     elif i == 1:
    #         b += 1
    #     elif i == 2:
    #         c +=1
    # print(a,b,c)
    # print(x)
    # a = x[0:50, :]
    # b = x[50:100, :]
    # c = x[100:150, :]
    # # plot the per-class feature values
    # scatter3D = Scatter3D("All data points", width=1200, height=600)
    # scatter3D.add("a", a)
    # scatter3D.add("b", b)
    # scatter3D.add("c", c)
    # scatter3D.render(path='./echart_html/3dknn_iris.html')
    #
    # # split into training and test sets
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=3)
    # # standardize the data
    # sd = StandardScaler()
    # x_train = sd.fit_transform(x_train)
    # x_test = sd.transform(x_test)
    # # create the estimator
    # kn = KNeighborsClassifier(n_neighbors=4)
    # kn.fit(x_train, y_train)
    # y_pred = kn.predict(x_test)
    # print('test-set accuracy:', kn.score(x_test, y_test))
    # print('training-set accuracy:', kn.score(x_train, y_train))
    # print('precision/recall report:', classification_report(y_test, y_pred, target_names=target_names))
    #
    # # hyperparameter tuning via cross-validation and grid search
    # knn = KNeighborsClassifier()
    # params = {'n_neighbors': [3, 4, 5, 6, 7, 8]}
    # gs = GridSearchCV(knn, params, cv=3)
    # gs.fit(x_train, y_train)
    # print('gs test-set accuracy:', gs.score(x_test, y_test))
    # print('gs training-set accuracy:', gs.score(x_train, y_train))
    # print('gs precision/recall report:', classification_report(y_test, gs.predict(x_test), target_names=target_names))
    # print(gs.best_params_)
    # print(gs.best_score_)  # mean cross-validation score for the best parameters -- not the same thing as gs.score
    # print(gs.best_estimator_)
    # print(gs.cv_results_)
    #
    # print(gs.predict_proba(x_test))
    #
    # # plot the training and test sets separately
    # scatter3D = Scatter3D("Train vs. test split", width=1200, height=600)
    # scatter3D.add("training set", sd.inverse_transform(x_train))
    # scatter3D.add("test set", sd.inverse_transform(x_test))
    # scatter3D.render(path='./echart_html/3dknn_iris_train_test.html')


def bubble_sort(nums, nums1):
    # Sort nums ascending and apply the same swaps to nums1 so the two stay
    # paired. Work on copies: sorting the callers' arrays in place would break
    # the pairing on the second call in knn_huigui (y_test would already be
    # sorted while the new y_pred is still in the original order).
    nums, nums1 = np.array(nums), np.array(nums1)
    for i in range(len(nums) - 1):  # number of bubble-sort passes
        for j in range(len(nums) - i - 1):  # j indexes the left element of each adjacent pair
            if nums[j] > nums[j + 1]:
                nums[j], nums[j + 1] = nums[j + 1], nums[j]
                nums1[j], nums1[j + 1] = nums1[j + 1], nums1[j]
    return nums, nums1
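

# Vectorized alternative to bubble_sort (illustrative sketch, not called
# below): np.argsort yields the same paired ordering in O(n log n) without
# hand-rolled swaps.
def argsort_pair(nums, nums1):
    order = np.argsort(nums)
    return np.asarray(nums)[order], np.asarray(nums1)[order]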


def knn_huigui():
    boston = load_boston()
    # print(boston)
    x = boston.data
    y = boston.target
    # split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=666)
    # standardize the data
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    kn1 = KNeighborsRegressor(n_neighbors=4, weights='uniform')  # every neighbor votes with equal weight
    kn1.fit(x_train, y_train)
    y_pred = kn1.predict(x_test)

    # Mean squared error scores the regression, but the raw number alone is
    # hard to judge, so the results are plotted below as well
    print('mean squared error:', mean_squared_error(y_test, y_pred))
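    # The same value can be verified by hand, since MSE is just the mean of
    # the squared residuals (illustrative):
    # print('manual MSE:', np.mean((y_test - y_pred) ** 2))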

    # plot actual vs. predicted as a line chart, sorted by the actual values
    x_axis = list(range(1, y_test.shape[0] + 1))
    line = Line("KNN regression: actual vs. predicted", width=1200)
    y_test_s, y_pred_s = bubble_sort(y_test, y_pred)
    line.add("actual", x_axis, y_test_s, is_smooth=True)
    line.add("predicted", x_axis, y_pred_s, is_smooth=True)
    line.render(path='./echart_html/knn回归结果对比折线图平均值.html')

    # distance-weighted variant: closer neighbors get larger weights
    kn1 = KNeighborsRegressor(n_neighbors=4, weights='distance')
    kn1.fit(x_train, y_train)
    y_pred = kn1.predict(x_test)
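    # What weights='distance' computes for one query point (illustrative
    # sketch; d = the k neighbor distances, t = their target values):
    #   y_hat = np.sum(t / d) / np.sum(1.0 / d)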

    # evaluate with mean squared error again for comparison with uniform weighting
    print('mean squared error:', mean_squared_error(y_test, y_pred))

    # plot actual vs. predicted as a line chart, sorted by the actual values
    line = Line("KNN regression: actual vs. predicted", width=1200)
    y_test_s, y_pred_s = bubble_sort(y_test, y_pred)
    line.add("actual", x_axis, y_test_s, is_smooth=True)
    line.add("predicted", x_axis, y_pred_s, is_smooth=True)
    line.render(path='./echart_html/knn回归结果对比折线图加权值.html')


def astock():
    # Load the data from a csv file
    df = pd.read_csv('./stockdata/sh.csv')
    df.pop('date')
    y = df.pop('up_down')
    x = df
    # Scaling is fitted only after the split, so no test-set statistics leak into training
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
    # standardize the data
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # print(x_train)
    # print(x_test)
    # sweep k from 2 to 10 and report the scores for each value
    for i in range(2, 11):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        print(i, 'training-set score:', knn.score(x_train, y_train))
        print(i, 'test-set score:', knn.score(x_test, y_test))
        print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['down', 'up']))
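
    # Compact way to keep the best k from a sweep like the one above
    # (illustrative sketch; it scores on the test split exactly as the loop
    # does -- cross-validation would be the more principled choice):
    # best_k = max(range(2, 11),
    #              key=lambda k: KNeighborsClassifier(n_neighbors=k)
    #              .fit(x_train, y_train).score(x_test, y_test))
    # print('best k:', best_k)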


if __name__ == '__main__':
    # knn_1()
    knn_iris()
    # knn_huigui()
    # astock()
